Code
library(tidyverse)
library(here)
library(knitr)
library(kableExtra)
library(gt)
library(DT)
names <- read_csv(here("Labs", "Lab 9", "StateNames_A.csv"))library(tidyverse)
library(here)
library(knitr)
library(kableExtra)
library(gt)
library(DT)
names <- read_csv(here("Labs", "Lab 9", "StateNames_A.csv"))DT::datatable(names)Make a summary table of the number of babies named “Allison” for each state and the sex of the baby. Specifically, the table should have the following qualities: - each state should be its own row - and each sex should have its own column - if there were no babies born for that combination of state & sex there should be a 0 (not an NA)
names |>
rename(Sex = Gender) |>
filter(Name == "Allison") |>
select(State, Sex, Count) |>
group_by(State, Sex) |>
summarize(
Count = sum(Count)
) |>
pivot_wider(names_from = Sex,
values_from = Count,
values_fill = 0) |>
knitr::kable(format = "html",
col.names =
c("State", "Female", "Male"),
caption = "Number of Babies Born with the Name Allison By Sex and State"
) |>
kableExtra::kable_classic(html_font = "Optima") |>
kable_styling(bootstrap_options = "striped", font_size = 14)| State | Female | Male |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
You should have seen in the table above that “Allison” is a name given overwhelmingly to babies assigned “female” at birth. So, create a new dataset named allison_f which contains only the babies assigned Female at birth.
allison_f <- names |>
rename("Sex" = "Gender") |>
filter(Name == "Allison") |>
select(Year, Sex, State, Count)Make a visualization showing how the popularity of the name “Allison” has changed over the years. To be clear, each year should have one observation–the total number of Allisons born that year.
allison_f |>
group_by(Year) |>
summarize(
Count = sum(Count)
) |>
ggplot(mapping = (aes(x = Year, y = Count))) +
geom_line(color = "darkblue") +
scale_x_continuous(breaks = c("1997":"2014"),
guide = guide_axis(n.dodge = 2)) +
scale_y_continuous(n.breaks = 8) +
labs(x = "Year", y = "",
title = "Number of Babies Named Allison from 1997 to 2014") +
theme(text = element_text(family = "Optima"))#https://ggplot2.tidyverse.org/reference/scale_continuous.htmlFit a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #3, each year should have one observation–the total number of Allisons born that year.
allison_f |>
group_by(Year) |>
summarize(
Count = sum(Count)
) |>
ggplot(mapping = aes(x = Year, y = Count)) +
geom_point(color = "darkblue") +
geom_smooth(method = "lm", fill = "lightblue", color = "aquamarine4") +
scale_x_continuous(breaks = c("1997":"2014"),
guide = guide_axis(n.dodge = 2)) +
scale_y_continuous(n.breaks = 8) +
labs(x = "Year", y = "Count", title = "Number of Babies Named Allison from 1997 to 2014") +
theme(text = element_text(family = "Optima"))allison_f |>
group_by(Year) |>
summarize(
Count = sum(Count)
) |>
lm(Count ~ Year,
data = _)
Call:
lm(formula = Count ~ Year, data = summarize(group_by(allison_f,
Year), Count = sum(Count)))
Coefficients:
(Intercept) Year
209815.1 -101.6
The regression equation is:
predicted number of babies = 209815.1 - 101.6(Year)
allison_f |>
group_by(Year) |>
summarize(
Count = sum(Count)
) |>
lm(Count ~ Year,
data = _) |>
broom::augment() |>
ggplot(mapping = aes(y = .resid, x = .fitted)) +
geom_point(color = "darkslategray4") +
labs(x = "Fitted", y = "Residuals",
title = "Residuals vs. Fitted Model on Number of Babies Named Allison from 1997 to 2014") +
theme(text = element_text(family = "Optima"))What do you conclude from this model? Is my name not cool anymore?
Based on the residuals vs. fitted model, there appears to be a curved pattern. This demonstrates that there is a nonlinear relationship between the number of babies named Allison across the years. This proves that the number of babies named Allison have not been growing since the previous years.
Narrow the A name dataset (downloaded previously) down to only male-assigned babies named “Allan”, “Alan”, or “Allen”. Make a plot comparing the popularity of these names over time.
names |>
rename("Sex" = "Gender") |>
filter(Name %in% c("Allan", "Alan", "Allen"),
Sex == "M") |>
group_by(Name, Year) |>
summarize(
Count = sum(Count)
) |>
ggplot(mapping = aes(x = Year, y = Count, color = Name)) +
geom_line() +
scale_y_continuous(n.breaks = 8) +
scale_color_manual(values = c("darkslategray4", "aquamarine3", "cyan2")) +
labs(x = "Year", y = "Count",
title = "Number of Babies Named Alan, Allan, or Allen from 1997 to 2014") +
theme(text = element_text(family = "Optima"))In California, Allan’s spelling of his name is the least common of the three but perhaps it’s not such an unusual name for his home state of Pennsylvania. Compute the total number of babies born with each spelling of “Allan” in the year 2000, in Pennsylvania and in California. Specifically, the table should have the following qualities: - each spelling should be its own column - each state should have its own row - a 0 (not an NA) should be used to represent locations where there were no instances of these names
names |>
rename("Sex" = "Gender") |>
filter(Year == 2000,
State %in% c("CA", "PA"),
Name %in% c("Allan", "Alan", "Allen"),
Sex == "M"
) |>
select(Name, State, Count) |>
group_by(Name)|>
pivot_wider(names_from = Name,
values_from = Count)|>
knitr::kable(format = "html",
col.names =
c("State", "Alan", "Allen", "Allan"),
caption = "Number of Male Babies Born in 2000 by Name and State"
) |>
kableExtra::kable_classic(html_font = "Optima") |>
kable_styling(bootstrap_options = "striped", font_size = 14)| State | Alan | Allen | Allan |
|---|---|---|---|
| CA | 579 | 176 | 131 |
| PA | 51 | 56 | 12 |
Convert your total counts to overall percents. That is, what was the percent breakdown between the three spellings in CA? What about in PA?
names |>
rename(Sex = Gender) |>
filter(Year == 2000,
State %in% c("CA", "PA"),
Name %in% c("Allan", "Alan", "Allen"),
Sex == "M"
) |>
group_by(Name)|>
mutate(
Percentage = Count/sum(Count)
) |>
select(Name, State, Percentage) |>
pivot_wider(names_from = Name,
values_from = Percentage)|>
gt() |>
tab_header(title = "Percentage of Male Babies Born in 2000 by Name and State") |>
fmt_percent(columns = 2:4, decimals = 2) | Percentage of Male Babies Born in 2000 by Name and State | |||
| State | Alan | Allen | Allan |
|---|---|---|---|
| CA | 91.90% | 75.86% | 91.61% |
| PA | 8.10% | 24.14% | 8.39% |